# This file estimates if pretreatment outcomes predict twinning conditional on 
# age fe 
# we do it on a yearly basis, so we dont need cohort fes 

install.packages("dplyr")
install.packages("date")
install.packages("AER")
install.packages("rio")

library(dplyr)
library(date)
library(AER)
library(rio)

# Load data
load("mothers.rdata")

# Filter of if age of oldest is unkwown (select parents )
mothers <-
  mothers %>%
  filter(!is.na(age_oldest)) 

# Load data
load("fathers.rdata")

# Filter of if age of oldest is unkwown (select parents )
fathers <-
  fathers %>%
  filter(!is.na(age_oldest)) 

robust_coefs <- data_frame()

for(year in c(2007:2014)){
  
  # subset to parent with children born each year from 2010 and onwards 
  mothers_year <- 
    mothers %>%
    filter(age_oldest >= mdy.date(01, 01, year - 10) &
             age_oldest < mdy.date(01, 01, year - 9)) 
  
  mothers_year <-
    mothers_year  %>%
    mutate(age = ntile(date, 20),
           twin_first = twin_first > 0)
  
  # Load and recode education data
  
  udd <- import(paste("udda", year - 1, ".dta", sep = ""))
    
  mothers_year <-
    left_join(mothers_year, udd, by = "pnr") %>% 
    mutate(primary_school = hfaudd_hovedgruppe == 10, 
           high_school    = hfaudd_hovedgruppe == 20 | hfaudd_hovedgruppe == 25,
           vocational     = hfaudd_hovedgruppe == 35, 
           short_tertiary = hfaudd_hovedgruppe == 40, 
           bachelor       = hfaudd_hovedgruppe == 50 | hfaudd_hovedgruppe == 60,
           masters_phd    = hfaudd_hovedgruppe == 65 | hfaudd_hovedgruppe == 70)
  
  # Load and recode income data
  
  ind <- import(paste("indh", year - 1, ".dta", sep = ""))
  
  mothers_year <-
    left_join(mothers_year, ind, by = "pnr") %>% 
    mutate(log_income = ifelse(perindkialt > 0, log(perindkialt), 0)) 

  # Load and recode hospitalization data
  
  lpr <- import(paste("lpradm", year - 2,".sas7bdat", sep = ""))
  
  lpr <- 
    lpr %>% 
    filter(!is.na(V_SENGDAGE)) %>% 
    group_by(pnr) %>% 
    summarise(logbeddays = log(sum(V_SENGDAGE)))
      
  mothers_year <-
    left_join(mothers_year, lpr, by = "pnr")  %>% 
    mutate(logbeddays = ifelse(!is.na(logbeddays), logbeddays, 0))
  
  # Load population register 
  
  bef <- import(paste("bef", year - 1, ".dta", sep = ""))
  
  mothers_year <-
    left_join(mothers_year, bef, by = "pnr") %>% 
    mutate(danish = IE_TYPE == 1)

  # Load and recode employment data
  
  ras <- import(paste("ras", year - 2, ".dta", sep = ""))
  
  if(year <= 2009){
    mothers_year <-
      left_join(mothers_year, ras, by = "pnr") %>% 
      mutate(SOCSTIL_KODE = if_else(is.na(SOCSTIL_KODE), 500, SOCSTIL_KODE)) %>% 
      group_by(pnr) %>% 
      mutate(SOCSTIL_KODE_min = min(SOCSTIL_KODE)) %>% 
      filter(SOCSTIL_KODE_min == SOCSTIL_KODE) %>% 
      sample_n(1) %>% 
      mutate(full_time = SOCSTIL_KODE < 200)
  }
   
  if(year > 2009){
    mothers_year <-
      left_join(mothers_year, ras, by = "pnr") %>% 
      mutate(SOC_STATUS_KODE = if_else(is.na(SOC_STATUS_KODE), 500, SOC_STATUS_KODE)) %>% 
      group_by(pnr) %>% 
      mutate(SOC_STATUS_KODE_min = min(SOC_STATUS_KODE)) %>% 
      filter(SOC_STATUS_KODE_min == SOC_STATUS_KODE) %>% 
      sample_n(1) %>% 
      mutate(full_time = SOC_STATUS_KODE < 200)
  }
  
  # run models on education 
  mod_edu1 <- lm(primary_school ~ twin_first + as.factor(age), data = mothers_year)
  mod_edu2 <- lm(high_school    ~ twin_first + as.factor(age), data = mothers_year)
  mod_edu3 <- lm(vocational     ~ twin_first + as.factor(age), data = mothers_year)
  mod_edu4 <- lm(short_tertiary ~ twin_first + as.factor(age), data = mothers_year)
  mod_edu5 <- lm(bachelor       ~ twin_first + as.factor(age), data = mothers_year)
  mod_edu6 <- lm(masters_phd    ~ twin_first + as.factor(age), data = mothers_year)
  
  # run model on income
  mod_inco <- lm(log_income ~ twin_first + as.factor(age), data = mothers_year)
  
  # run model on full time employment
  mod_full <- lm(full_time ~ twin_first + as.factor(age), data = mothers_year)
  
  # run model on ethnicity
  mod_dane <- lm(danish ~ twin_first + as.factor(age), data = mothers_year)
  
  # run model on hospitalization
  mod_lbed <- lm(logbeddays  ~ twin_first + as.factor(age), data = mothers_year)
  
  # store estimates
  coefs <- data_frame(year = year,
                      coef = c(summary(mod_edu1)$coefficients[2,1],
                               summary(mod_edu2)$coefficients[2,1],
                               summary(mod_edu3)$coefficients[2,1],
                               summary(mod_edu4)$coefficients[2,1],
                               summary(mod_edu5)$coefficients[2,1],
                               summary(mod_edu6)$coefficients[2,1],
                               summary(mod_inco)$coefficients[2,1],
                               summary(mod_full)$coefficients[2,1],
                               summary(mod_dane)$coefficients[2,1],
                               summary(mod_lbed)$coefficients[2,1]),
                      se   = c(summary(mod_edu1)$coefficients[2,2],
                               summary(mod_edu2)$coefficients[2,2],
                               summary(mod_edu3)$coefficients[2,2],
                               summary(mod_edu4)$coefficients[2,2],
                               summary(mod_edu5)$coefficients[2,2],
                               summary(mod_edu6)$coefficients[2,2],
                               summary(mod_inco)$coefficients[2,2],
                               summary(mod_full)$coefficients[2,2],
                               summary(mod_dane)$coefficients[2,2],
                               summary(mod_lbed)$coefficients[2,2]),
                      labe = c("Primary school",
                               "High school",
                               "Vocational training",
                               "Short tertiary",
                               "Mediumlength tertiary/bachelor",
                               "Masters or PhD",
                               "Personal income (log)",
                               "Full time employment",
                               "Danish origin",
                               "Days in hospital (log)"),
                      plac = c(1,2,3,4,5,6,8,10,12, 14), 
                      parent = "Mother", 
                      fe     = "YES")
  
  robust_coefs <- 
    bind_rows(robust_coefs, coefs)

  # repeat without fixed effects    
  
  mod_edu1 <- lm(primary_school ~ twin_first, data = mothers_year)
  mod_edu2 <- lm(high_school    ~ twin_first, data = mothers_year)
  mod_edu3 <- lm(vocational     ~ twin_first, data = mothers_year)
  mod_edu4 <- lm(short_tertiary ~ twin_first, data = mothers_year)
  mod_edu5 <- lm(bachelor       ~ twin_first, data = mothers_year)
  mod_edu6 <- lm(masters_phd    ~ twin_first, data = mothers_year)
  
  mod_inco <- lm(log_income ~ twin_first, data = mothers_year)
  
  mod_full <- lm(full_time ~ twin_first, data = mothers_year)
  
  mod_dane <- lm(danish ~ twin_first, data = mothers_year)
  
  mod_lbed <- lm(logbeddays  ~ twin_first, data = mothers_year)
  
  coefs <- data_frame(year = year,
                      coef = c(summary(mod_edu1)$coefficients[2,1],
                               summary(mod_edu2)$coefficients[2,1],
                               summary(mod_edu3)$coefficients[2,1],
                               summary(mod_edu4)$coefficients[2,1],
                               summary(mod_edu5)$coefficients[2,1],
                               summary(mod_edu6)$coefficients[2,1],
                               summary(mod_inco)$coefficients[2,1],
                               summary(mod_full)$coefficients[2,1],
                               summary(mod_dane)$coefficients[2,1],
                               summary(mod_lbed)$coefficients[2,1]),
                      se   = c(summary(mod_edu1)$coefficients[2,2],
                               summary(mod_edu2)$coefficients[2,2],
                               summary(mod_edu3)$coefficients[2,2],
                               summary(mod_edu4)$coefficients[2,2],
                               summary(mod_edu5)$coefficients[2,2],
                               summary(mod_edu6)$coefficients[2,2],
                               summary(mod_inco)$coefficients[2,2],
                               summary(mod_full)$coefficients[2,2],
                               summary(mod_dane)$coefficients[2,2],
                               summary(mod_lbed)$coefficients[2,2]),
                      labe = c("Primary school",
                               "High school",
                               "Vocational training",
                               "Short tertiary",
                               "Mediumlength tertiary/bachelor",
                               "Masters or PhD",
                               "Personal income (log)",
                               "Full time employment",
                               "Danish origin",
                               "Days in hospital (log)"),
                      plac = c(1,2,3,4,5,6,8,10,12, 14), 
                      parent = "Mother", 
                      fe     = "NO")
  
  robust_coefs <- 
    bind_rows(robust_coefs, coefs)
  
# repeat for fathers ------------------------------------------------------

  # subset to parent with children born each year from 2010 and onwards 
  fathers_year <- 
    fathers %>%
    filter(age_oldest >= mdy.date(01, 01, year - 10) &
             age_oldest < mdy.date(01, 01, year - 9)) 
  
  fathers_year <-
    fathers_year  %>%
    mutate(age = ntile(date, 20),
           twin_first = twin_first > 0)
  
  fathers_year <-
    left_join(fathers_year, udd, by = "pnr") %>% 
    mutate(primary_school = hfaudd_hovedgruppe == 10, 
           high_school    = hfaudd_hovedgruppe == 20 | hfaudd_hovedgruppe == 25,
           vocational     = hfaudd_hovedgruppe == 35, 
           short_tertiary = hfaudd_hovedgruppe == 40, 
           bachelor       = hfaudd_hovedgruppe == 50 | hfaudd_hovedgruppe == 60,
           masters_phd    = hfaudd_hovedgruppe == 65 | hfaudd_hovedgruppe == 70)
  
  fathers_year <-
    left_join(fathers_year, ind, by = "pnr") %>% 
    mutate(log_income = ifelse(perindkialt > 0, log(perindkialt), 0))
  
  fathers_year <-
    left_join(fathers_year, lpr, by = "pnr") %>% 
    mutate(logbeddays = ifelse(!is.na(logbeddays), logbeddays, 0))
  
  fathers_year <-
    left_join(fathers_year, bef, by = "pnr") %>% 
    mutate(danish = IE_TYPE == 1)
  
  if(year <= 2009){
    fathers_year <-
      left_join(fathers_year, ras, by = "pnr") %>% 
      mutate(SOCSTIL_KODE = if_else(is.na(SOCSTIL_KODE), 500, SOCSTIL_KODE)) %>% 
      group_by(pnr) %>% 
      mutate(SOCSTIL_KODE_min = min(SOCSTIL_KODE)) %>% 
      filter(SOCSTIL_KODE_min == SOCSTIL_KODE) %>% 
      sample_n(1) %>% 
      mutate(full_time = SOCSTIL_KODE < 200)
  }
  
  if(year > 2009){
    fathers_year <-
      left_join(fathers_year, ras, by = "pnr") %>% 
      mutate(SOC_STATUS_KODE = if_else(is.na(SOC_STATUS_KODE), 500, SOC_STATUS_KODE)) %>% 
      group_by(pnr) %>% 
      mutate(SOC_STATUS_KODE_min = min(SOC_STATUS_KODE)) %>% 
      filter(SOC_STATUS_KODE_min == SOC_STATUS_KODE) %>% 
      sample_n(1) %>% 
      mutate(full_time = SOC_STATUS_KODE < 200)
  }
  
  mod_edu1 <- lm(primary_school ~ twin_first + as.factor(age), data = fathers_year)
  mod_edu2 <- lm(high_school    ~ twin_first + as.factor(age), data = fathers_year)
  mod_edu3 <- lm(vocational     ~ twin_first + as.factor(age), data = fathers_year)
  mod_edu4 <- lm(short_tertiary ~ twin_first + as.factor(age), data = fathers_year)
  mod_edu5 <- lm(bachelor       ~ twin_first + as.factor(age), data = fathers_year)
  mod_edu6 <- lm(masters_phd    ~ twin_first + as.factor(age), data = fathers_year)
  
  mod_inco <- lm(log_income ~ twin_first + as.factor(age), data = fathers_year)
  
  mod_full <- lm(full_time ~ twin_first + as.factor(age), data = fathers_year)
  
  mod_dane <- lm(danish ~ twin_first + as.factor(age), data = fathers_year)
  
  mod_lbed <- lm(logbeddays ~ twin_first + as.factor(age), data = fathers_year)
  
  coefs <- data_frame(year = year,
                      coef = c(summary(mod_edu1)$coefficients[2,1],
                               summary(mod_edu2)$coefficients[2,1],
                               summary(mod_edu3)$coefficients[2,1],
                               summary(mod_edu4)$coefficients[2,1],
                               summary(mod_edu5)$coefficients[2,1],
                               summary(mod_edu6)$coefficients[2,1],
                               summary(mod_inco)$coefficients[2,1],
                               summary(mod_full)$coefficients[2,1],
                               summary(mod_dane)$coefficients[2,1],
                               summary(mod_lbed)$coefficients[2,1]),
                      se   = c(summary(mod_edu1)$coefficients[2,2],
                               summary(mod_edu2)$coefficients[2,2],
                               summary(mod_edu3)$coefficients[2,2],
                               summary(mod_edu4)$coefficients[2,2],
                               summary(mod_edu5)$coefficients[2,2],
                               summary(mod_edu6)$coefficients[2,2],
                               summary(mod_inco)$coefficients[2,2],
                               summary(mod_full)$coefficients[2,2],
                               summary(mod_dane)$coefficients[2,2],
                               summary(mod_lbed)$coefficients[2,2]),
                      labe = c("Primary school",
                               "High school",
                               "Vocational training",
                               "Short tertiary",
                               "Mediumlength tertiary/bachelor",
                               "Masters or PhD",
                               "Personal income (log)",
                               "Full time employment",
                               "Danish origin",
                               "Days in hospital (log)"),
                      plac = c(1,2,3,4,5,6,8,10,12, 14), 
                      parent = "Father",
                      fe = "YES")
  
  robust_coefs <- 
    bind_rows(robust_coefs, coefs)
  
  # repeat without FEs
  
  mod_edu1 <- lm(primary_school ~ twin_first, data = fathers_year)
  mod_edu2 <- lm(high_school    ~ twin_first, data = fathers_year)
  mod_edu3 <- lm(vocational     ~ twin_first, data = fathers_year)
  mod_edu4 <- lm(short_tertiary ~ twin_first, data = fathers_year)
  mod_edu5 <- lm(bachelor       ~ twin_first, data = fathers_year)
  mod_edu6 <- lm(masters_phd    ~ twin_first, data = fathers_year)
  
  mod_inco <- lm(log_income ~ twin_first, data = fathers_year)
  
  mod_full <- lm(full_time ~ twin_first, data = fathers_year)
  
  mod_dane <- lm(danish ~ twin_first, data = fathers_year)
  
  mod_lbed <- lm(logbeddays ~ twin_first, data = fathers_year)
  
  coefs <- data_frame(year = year,
                      coef = c(summary(mod_edu1)$coefficients[2,1],
                               summary(mod_edu2)$coefficients[2,1],
                               summary(mod_edu3)$coefficients[2,1],
                               summary(mod_edu4)$coefficients[2,1],
                               summary(mod_edu5)$coefficients[2,1],
                               summary(mod_edu6)$coefficients[2,1],
                               summary(mod_inco)$coefficients[2,1],
                               summary(mod_full)$coefficients[2,1],
                               summary(mod_dane)$coefficients[2,1],
                               summary(mod_lbed)$coefficients[2,1]),
                      se   = c(summary(mod_edu1)$coefficients[2,2],
                               summary(mod_edu2)$coefficients[2,2],
                               summary(mod_edu3)$coefficients[2,2],
                               summary(mod_edu4)$coefficients[2,2],
                               summary(mod_edu5)$coefficients[2,2],
                               summary(mod_edu6)$coefficients[2,2],
                               summary(mod_inco)$coefficients[2,2],
                               summary(mod_full)$coefficients[2,2],
                               summary(mod_dane)$coefficients[2,2],
                               summary(mod_lbed)$coefficients[2,2]),
                      labe = c("Primary school",
                               "High school",
                               "Vocational training",
                               "Short tertiary",
                               "Mediumlength tertiary/bachelor",
                               "Masters or PhD",
                               "Personal income (log)",
                               "Full time employment",
                               "Danish origin",
                               "Days in hospital (log)"),
                      plac = c(1,2,3,4,5,6,8,10,12, 14), 
                      parent = "Father",
                      fe = "NO")
  
  robust_coefs <- 
    bind_rows(robust_coefs, coefs)
  
  print(Sys.time())
  print(year)
}

write.table(robust_coefs, "mechanism.txt")

  